Installing Packages and setting up the environment
source activate py27
source activate py35
source activate py36
conda update matplotlib
conda install pandas-datareader --update
Creating and working with Series and DataFrame
In [1]:
import pandas as pd
# One record per person: (id, name, surname, age, gender, city).
_people = [
    (100, 'Ali', 'Yilmaz', 27, 'M', 'Istanbul'),
    (200, 'Veli', 'Gorali', 32, 'M', 'Istanbul'),
    (300, 'Ayse', 'Tasci', 19, 'F', 'Ankara'),
    (301, 'Fatma', 'Bakkaloglu', 28, 'F', 'Istanbul'),
    (308, 'Gamze', 'Yilmaz', 32, 'F', 'Izmir'),
]
# Transpose the records into the six parallel column lists used by later cells.
ids, names, surnames, ages, gender, city = (list(col) for col in zip(*_people))

# (city, plate code) pairs; the codes are strings, so leading zeros are kept.
number_plate = [
    ('Adana', '01'),
    ('Eskisehir', '26'),
    ('Istanbul', '34'),
    ('Ankara', '06'),
    ('Izmir', '35'),
    ('Denizli', '20'),
]

# A Series built from a plain list gets the default 0..n-1 integer index.
s = pd.Series(names)
In [2]:
# People table: one row per person, labelled by the person's id.
df1 = pd.DataFrame(
    {
        'Name': names,
        'Surname': surnames,
        'Age': ages,
        'Gender': gender,
        'City': city,
    },
    index=ids,
)

# City -> plate-code lookup table with a default integer index.
df2 = pd.DataFrame(number_plate, columns=['City', 'Plate'])

# `df` ends the cell bound to the same object as df2, exactly as before.
df = df2
In [3]:
# Display the people table as the cell output.
df1
Out[3]:
In [4]:
# Same names/surnames, but labelled 1..n instead of by id.
df3 = pd.DataFrame(
    {'Name': names, 'Surname': surnames},
    index=range(1, len(names) + 1),
)
# Selecting a single column label returns a Series.
df3['Surname']
Out[4]:
In [5]:
# Display df2 (at this point in the notebook: the City/Plate table).
df2
Out[5]:
In [6]:
# (place, latitude, longitude) records.
data = [
    ('Kadikoy', 40.001, 29.37),
    ('Etiler', 41.002, 29.38),
    ('Yesilkoy', 40.376, 28.97),
]
mahalle = pd.DataFrame(data, columns=['Place', 'Lat', 'Lon'])

# Indexing with a list of labels returns the columns in the requested order.
_display_order = ['Lat', 'Lon', 'Place']
mahalle[_display_order]
Out[6]:
In [7]:
# Load AAPL prices from a local CSV: column 0 becomes the index and is parsed as dates.
# NOTE(review): "aapl.csv" is only written by a later cell (aapl.to_csv); on a fresh
# kernel this cell fails unless the file already exists in the working directory.
aapl = pd.read_csv("aapl.csv", index_col=0, parse_dates=True)
aapl
Out[7]:
In [8]:
# Build a DataFrame from whatever text is currently on the system clipboard.
# NOTE(review): not reproducible - the result depends on the clipboard at run time.
u = pd.read_clipboard()
In [9]:
# Display the frame read from the clipboard.
u
Out[9]:
In [50]:
# Inner join on City: keeps only people whose city also appears in df2, ordered by Name.
# Merging on a column ignores both frames' indexes, so df1's id labels are not kept.
pd.merge(df1, df2, on='City', how='inner').sort_values('Name')
Out[50]:
In [146]:
# Outer join with no explicit key: pandas joins on the column labels the two frames share.
# Unmatched rows from either side are kept with NaN in the other side's columns; ordered by Plate.
pd.merge(df1, df2, how='outer').sort_values('Plate')
Out[146]:
In [113]:
# People ordered by ascending Age (returns a new frame; df1 itself is unchanged).
df1.sort_values('Age')
Out[113]:
In [114]:
# People ordered alphabetically by City.
df1.sort_values('City')
Out[114]:
In [149]:
# Count occurrences of each surname, then order the counts ascending.
df1['Surname'].value_counts().sort_values()
Out[149]:
In [150]:
# Count occurrences of each gender, then order the counts ascending.
df1['Gender'].value_counts().sort_values()
Out[150]:
In [153]:
# Integer position of the label 300 within df1's index.
df1.index.get_loc(300)
Out[153]:
In [154]:
# Scalar lookup by label: the 'Name' value of the row labelled 100.
df1.at[100, 'Name']
Out[154]:
In [157]:
# Print the frame as plain text, then fetch one scalar purely by position
# (row 0, column 1), independent of the index and column labels.
print(df1)
df1.iat[0,1]
Out[157]:
In [159]:
# Boolean mask: people younger than 30 who live in Istanbul.
# The parentheses are required because & binds tighter than the comparisons.
df1[(df1.Age<30) & (df1.City == 'Istanbul')]
Out[159]:
Subframe
In [174]:
# Sub-frame: Name/Surname columns for the rows labelled 100 through 300.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. df1 has an integer index,
# so `.ix[100:300]` was label-based, and `.loc` gives the same inclusive label slice.
sdf1 = df1.loc[100:300, ['Name', 'Surname']]
Out[174]:
In [193]:
# Five values labelled 'A'..'E'.
s = pd.Series([3.2, 3, 1, 10, 4], index=list('ABCDE'))
print(s)

# reindex keeps the values of labels that exist and inserts NaN for new ones ('F').
q = s.reindex(list('ACEF'))
print(q)

# Building a frame from two Series aligns them on the union of their labels.
u = pd.DataFrame({'s': s, 'q': q})
In [203]:
# Uppercase ASCII alphabet: code points 65 ('A') through 90 ('Z').
letters = list(map(chr, range(ord('A'), ord('Z') + 1)))
# Re-label u on the full alphabet; labels absent from u become all-NaN rows.
u.reindex(letters)
Out[203]:
In [22]:
# Display the people table.
df1
Out[22]:
In [23]:
# Row filter written as a query string: people with Age below 30.
df1.query('Age<30')
Out[23]:
In [48]:
# Wide -> long: Age and City stay as identifier columns. Every other column is
# unpivoted into a ('variable', 'value') pair, one row per person and column.
tmp = pd.melt(df1, id_vars=['Age', 'City'])

# Long -> wide again. The original `tmp.pivot(values='value')` cannot work because
# pivot needs to be told which column supplies the new column labels. Here the
# (Age, City) pair identifies a row and 'variable' holds the former column names.
# aggfunc='first' keeps the string values as-is; the default 'mean' is numeric-only.
tmp.pivot_table(index=['Age', 'City'], columns='variable', values='value', aggfunc='first')
In [54]:
# Two rows drawn at random; no random_state is passed, so the pick differs between runs.
df1.sample(2)
Out[54]:
In [25]:
# Keep the columns whose label matches the regex '.ender' (any one character followed by "ender").
df1.filter(regex='.ender')
Out[25]:
In [26]:
# Number of people per city, most frequent first.
df1['City'].value_counts()
Out[26]:
In [27]:
# Number of people per gender (attribute-style column access).
df1.Gender.value_counts()
Out[27]:
In [36]:
# Display the people table before grouping it.
df1
Out[36]:
Grouping
In [44]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import pandas as pd
# Split the people by Gender, then rank each column's values within its group.
# method='dense' gives tied values the same rank and leaves no gaps between ranks.
gb = df1.groupby('Gender')
gb.rank(method='dense')
Out[44]:
Let's load some data from the web
In [199]:
import pandas as pd
import pandas_datareader as web
import datetime
# Download MSFT and AAPL quotes for [start, end] through pandas-datareader's 'yahoo' source.
# Needs network access.
# NOTE(review): this window is 2015-01-01..2017-12-15, but other cells in this notebook
# slice 2012 and 2018 dates out of msft/aapl - confirm the intended date range.
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime(2017, 12, 15)
msft = web.DataReader("MSFT", 'yahoo', start, end)
aapl = web.DataReader("AAPL", 'yahoo', start, end)
Display
In [56]:
# Plain-text dump of the Open and Close columns.
print(msft[['Open','Close']])
Optionally save to a csv file
In [3]:
# Write both frames, including their index, as CSV files in the working directory.
msft.to_csv("msft.csv")
aapl.to_csv("aapl.csv")
Read from file
In [7]:
# Read the CSVs back; column 0 becomes the index and is parsed as dates.
msft2 = pd.read_csv("msft.csv", index_col=0, parse_dates=True)
aapl2 = pd.read_csv("aapl.csv", index_col=0, parse_dates=True)
Check that data is the same
In [11]:
# Print the first rows of the reloaded and the in-memory MSFT frames to compare them by eye.
print(msft2.head())
print(msft.head())
Get columns. Each column is a Series object.
In [16]:
# Select columns by label; a single label returns a Series.
# The original `msft[[0]]` asked for a column *labelled* 0. The quote frame's columns
# are labelled with strings, so that lookup fails on current pandas, and the
# list-style selection would have produced a one-column DataFrame rather than a Series.
Open = msft['Open']
High = msft['High']
# An integer slice on a Series is positional: the first three rows.
print(Open[0:3])
In [37]:
# Month-level sub-frames via partial-string date matching on the index.
# `.loc[rows, columns]` is used throughout: selecting rows with `frame['2012-01']`
# is deprecated and rejected by current pandas, because `frame[...]` with a single
# string is column selection.
# NOTE(review): assumes msft/aapl have a DatetimeIndex covering Jan-Feb 2012 - confirm
# against the date range actually loaded.
msftA01 = msft.loc['2012-01', ['Adj Close']]
msftA02 = msft.loc['2012-02', ['Adj Close']]
aaplA01 = aapl.loc['2012-01', ['Adj Close']]
msftAV = msft.loc['2012-01', ['Adj Close', 'Volume']]
aaplAV = aapl.loc['2012-01', ['Adj Close', 'Volume']]
In [64]:
# Display the MSFT 'Adj Close' sub-frame selected for '2012-01'.
msftA01
Out[64]:
In [84]:
# Select two columns; the result keeps the order given in the list.
msft[['Volume','Close']]
Out[84]:
Concatenate
In [22]:
# Stack the first three rows of the '2012-01' and '2012-02' sub-frames vertically.
pd.concat([msftA01.head(3), msftA02.head(3)])
Out[22]:
In [29]:
# Stacking two frames that share dates yields an index with duplicate labels.
withDups = pd.concat([aaplA01[:3], msftA01[:3]])
print(withDups)
# A label lookup on a duplicated label returns every matching row.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
withDups.loc['2012-01-04']
Out[29]:
Multiindex
In [44]:
# `keys=` adds an outer index level ('AAPL' / 'MSFT'), so rows that share a date
# stay distinguishable.
withDups = pd.concat([aaplA01[:3], msftA01[:3]], keys=['AAPL', 'MSFT'])
print(withDups)
# Selecting an outer-level label returns that block with the level dropped.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based replacement.
print(withDups.loc['AAPL'])
In [56]:
# Stack the two price/volume frames under an outer index level keyed 'MSFT' / 'AAPL'.
u = pd.concat([msftAV, aaplAV], keys = ['MSFT','AAPL'])
# Take the MSFT block first, then its rows matching '2012-01' on the remaining
# (presumably datetime) level.
u.loc['MSFT'].loc['2012-01']
Out[56]:
Merge: Inner and outer joins
In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
# Wide, short figure (15 x 3 inches) with the AAPL 'Adj Close' column as a line.
plt.figure(figsize=(15,3))
plt.plot(aapl[['Adj Close']])
plt.show()
In [29]:
# Column labels of the MSFT frame.
msft.columns
Out[29]:
In [33]:
# Reload MSFT from the CSV (column 0 as a date-parsed index) and display it.
msft2 = pd.read_csv("msft.csv", index_col=0, parse_dates=True)
msft2
Out[33]:
In [53]:
# Plot AAPL volume for the rows between 2018-02-03 and 2018-05-07.
plt.plot(aapl['2018-02-03':'2018-05-07']['Volume'])
# NOTE(review): `y` is not defined above this cell. Its only assignment in this notebook
# is in a later cell, where it is a 'Close' Series, so y['Volume'] would be a label
# lookup on that Series. Confirm what was meant to be plotted here.
plt.plot(y['Volume'])
plt.show()
In [69]:
# Slice the date window once and reuse it for all three columns.
feb_window = aapl.loc['2012-02-05':'2012-02-23']
x = feb_window['Open']
y = feb_window['Close']
Vol = feb_window['Volume']

# plt.plot does not derive a label from the Series, so each line is labelled
# explicitly. Without labels plt.legend() has nothing to show and only warns.
plt.plot(x, label='Open')
plt.plot(y, label='Close')
plt.legend()
plt.show()
In [78]:
import numpy as np
# Log of the ratio between consecutive opening prices over the first ten rows:
# rows 1..9 divided element-wise by rows 0..8.
# `.ix` was removed in pandas 1.0. On a non-integer index its integer slices were
# positional, so `.iloc` is the exact replacement. np.array() drops the index so the
# division is by position rather than by aligned date.
plt.plot(np.log(np.array(x.iloc[1:10]) / np.array(x.iloc[0:9])))
plt.show()
Write a program that visualizes a candlestick plot from aapl or msft
In [85]:
# Log returns of the adjusted close: difference between consecutive rows of the log price.
plt.plot(np.log(aapl['Adj Close']).diff())
plt.show()
In [96]:
#np.log(aapl['Adj Close']).diff().hist()
# Histogram (100 bins) of MSFT log returns, with the x-axis clipped to [-0.2, 0.2].
# Note that this rebinds `x`, which earlier cells used for a price slice.
x = msft['Adj Close']
np.log(x).diff().hist(bins=100)
plt.gca().set_xlim((-0.2,0.2))
plt.show()
In [200]:
# Overlay the 100-row rolling mean of Open on the raw Open series.
# The first 99 rolling values are NaN because the window is not yet full.
plt.figure(figsize=(10,8))
msft['Open'].rolling(100).mean().plot()
#msft['Open'].expanding().mean().plot()
msft['Open'].plot()
plt.show()
In [71]:
# Scatter of Open (x) against Close (y), one point per row.
msft.plot.scatter(x='Open', y ='Close')
plt.show()
In [195]:
# First five rows of MSFT (an integer slice on a frame selects rows by position).
u = msft[0:5]
In [198]:
# Shift the values one row up: each row shows the next row's values and the last row becomes NaN.
u.shift(-1)
Out[198]:
In [79]:
# Lag plot: Open against the Open value from 10 rows earlier, with equal axis scaling.
plt.scatter(msft['Open'], msft['Open'].shift(10))
plt.axis('equal')
plt.show()
In [76]:
# Each person's row gets the Age of the following row; the last row becomes NaN.
df1.Age.shift(-1)
Out[76]:
In [80]:
import this
In [190]:
# Fetch GOOG quotes through pandas-datareader's 'google' source and show the last rows.
# NOTE(review): every other DataReader call in this notebook uses 'yahoo'. Verify that
# the 'google' source is still supported by the installed pandas-datareader.
goog = web.DataReader('GOOG', data_source='google',start='3/14/2009', end='4/14/2014')
goog.tail()
Out[190]:
In [191]:
# Line plot of the GOOG Open column against the frame's index.
goog.Open.plot()
plt.show()
In [194]:
# Log return of each row relative to the PREVIOUS row: log(close_t / close_{t-1}).
# The original used shift(-1), which divides by the NEXT row's close. That gives a
# sign-inverted return that depends on future data, and it disagrees with the
# np.log(...).diff() returns computed elsewhere in this notebook.
goog['Log_return'] = np.log(goog['Close'] / goog['Close'].shift(1))
# Rolling 10-row standard deviation of the log returns. The column name is kept
# exactly as the notebook spells it.
goog['Volatality'] = goog.Log_return.rolling(10).std()
goog.Volatality.plot()
plt.show()
In [98]:
import pandas as pd
# Plate codes are used as row labels; they stay strings so leading zeros are kept.
plaka = ['01', '03', '06', '16', '26', '32', '34', '55', '67']
_city_names = ['Adana', 'Afyon', 'Ankara', 'Bursa', 'Eskisehir',
               'Isparta', 'Istanbul', 'Samsun', 'Zonguldak']
_is_mega = [True, False, False, True, False, False, True, False, False]

# Note: this rebinds `df`, which earlier cells used for a different table.
df = pd.DataFrame({'City': _city_names, 'MegaCity': _is_mega}, index=plaka)
In [99]:
# Display the city table indexed by plate code.
df
Out[99]:
In [100]:
# Two more cities with the same columns as `df`.
# Note: this rebinds `df2`, which earlier cells used for the City/Plate table.
df2 = pd.DataFrame(
    {'City': ['Icel', 'Antalya'], 'MegaCity': [False, True]},
    index=['33', '07'],
)
In [101]:
# Display the two additional cities.
df2
Out[101]:
In [104]:
# Append df2's rows below df's rows; each row keeps its own plate-code label.
sehirler = pd.concat([df, df2])
In [105]:
# Display the combined city table.
sehirler
Out[105]:
In [107]:
# One random customer count per city, drawn uniformly from 1..99 (the upper bound is
# exclusive). The sample size follows the table instead of the hard-coded 11, so the
# cell keeps working if rows are added or removed.
# Unseeded: the values change on every run.
sehirler['NumOfCustomers'] = np.random.randint(1, 100, len(sehirler))
In [115]:
# Total customers across the cities whose MegaCity flag is False.
sehirler.loc[sehirler.MegaCity.eq(False), 'NumOfCustomers'].sum()
Out[115]:
In [121]:
# Cities whose name starts with 'A' or 'a'.
# The original query compared the WHOLE name against "A"/"a" and so never matched.
# `.str[0]` takes the first character of each name before the membership test.
sehirler[sehirler.City.str[0].isin(['A', 'a'])]
In [118]:
# Boolean mask built by hand: True for every city whose name starts with 'A'.
# Iterating over the column values avoids `sehirler.City[i]`, an integer lookup on
# a string-labelled Series that only works through pandas' deprecated positional fallback.
lst = [name[0] == 'A' for name in sehirler.City]
# A list of booleans with one entry per row selects the rows marked True.
sehirler[lst]
Out[118]:
In [127]:
# Print every city name. Only one column is needed, so iterate that column directly
# rather than building an (index, row) pair per row with iterrows().
for city_name in sehirler.City:
    print(city_name)
In [138]:
# Print the cities whose name starts with 'A' or 'a'. Only one column is needed,
# so iterate that column directly instead of using iterrows().
for s in sehirler.City:
    if s[0] in ('A', 'a'):
        print(s)
In [146]:
# Same filter as a boolean Series: the lambda is True for names whose first character is 'A' or 'a'.
sehirler[sehirler.City.apply(lambda x: x[0] in ['A','a'])]
Out[146]:
In [150]:
# A random half of the rows; no random_state is passed, so the pick differs between runs.
sehirler.sample(frac=0.5)
Out[150]:
In [155]:
# For each MegaCity value, the number of non-null entries in every other column.
sehirler.groupby(by='MegaCity').count()
Out[155]:
In [156]:
# Flag is True for cities with more than 50 customers (element-wise comparison).
sehirler['Flag'] = sehirler.NumOfCustomers.gt(50)
In [157]:
# Display the city table including the new Flag column.
sehirler
Out[157]:
In [165]:
# Remove the helper column again. Rebinding the name instead of using inplace=True,
# together with errors='ignore', makes the cell safe to re-run: the original raised
# KeyError on a second execution because 'Flag' was already gone.
sehirler = sehirler.drop(['Flag'], axis=1, errors='ignore')
In [170]:
# Select the row labelled '55', then read its MegaCity field.
sehirler.loc['55'].MegaCity
In [175]:
# Same value the other way round: take the MegaCity column, then the label '55'.
sehirler.MegaCity['55']
Out[175]:
In [176]:
# Display the city table.
sehirler
Out[176]:
In [187]:
# MSFT Open as a line plot on a 10 x 5 inch figure.
plt.figure(figsize=(10,5))
msft['Open'].plot()
plt.show()
In [182]:
# Entries of the Open Series selected by the label '2015' (partial-string match on a
# presumably datetime index, i.e. the rows dated in 2015).
msft['Open']['2015']
Out[182]:
In [246]:
# Rows between 2015-01-01 and 2015-08-01. This rebinds `tmp`, used earlier for a melted frame.
tmp = msft['2015-01-01':'2015-08-01']
# 10-row rolling mean of Close; `m10` is not referenced again in the cells shown here.
m10 = tmp.Close.rolling(10).mean()
In [254]:
#tmp.Close.plot()
# Overlay a short (5-row) and a long (30-row) rolling mean of Close on the same axes.
tmp.Close.rolling(5).mean().plot()
tmp.Close.rolling(30).mean().plot()
plt.show()
In [225]:
# Fifth row of the window by position, returned as a Series keyed by column label.
tmp.iloc[4]
Out[225]:
In [228]:
# Difference between the 5-row and 30-row rolling means of Close.
# The first 29 values are NaN because the 30-row window is not yet full.
# Note that this rebinds `x`.
a = tmp.Close.rolling(5).mean()
b = tmp.Close.rolling(30).mean()
x = a - b
In [274]:
import matplotlib.pylab as plt
# The product of the signs of consecutive values of x is negative exactly where x
# changes sign. `change` is now computed BEFORE it is used: the original assigned it
# on the cell's last line, so the cell only ran when a previous execution had left
# `change` in the kernel.
change = np.sign(x.shift(1)) * np.sign(x)
tmp2 = change[change < 0]

x.plot(color='g')
ax = plt.gca()
# Mark the sign-change rows (each has value -1) on the same axes.
tmp2.plot(marker='o', style=None)
# Pass a real boolean: the string 'on' is not a documented value for grid().
ax.grid(True)
In [270]:
# Put a y-axis tick at every label of tmp2's index.
# NOTE(review): tmp2 is indexed like `x`, which is plotted along the x-axis, so
# set_xticks may have been intended - confirm.
ax.set_yticks(tmp2.index)
Out[270]:
In [269]:
# Keep only the negative entries of `change`, then display them.
tmp2 = change[change<0]
tmp2
Out[269]:
In [266]:
# Call .date() on every element of `idx` and collect the results in a list.
# NOTE(review): `idx` is never assigned in the cells shown here; presumably it was a
# datetime index such as tmp2.index - confirm and define it before this cell.
[u.date() for u in idx]
Out[266]:
In [279]:
# List comprehension: the cubes of 0..9.
[i**3 for i in range(10)]
Out[279]:
In [284]:
# Mapping from each integer 0..9 to its cube, built from (key, value) pairs.
d = dict((n, n ** 3) for n in range(10))
d
Out[284]:
A simple object
In [31]:
def my_plot(x):
    """Draw ``x`` as a line with ``plt.plot`` and show the resulting figure."""
    plt.plot(x)
    plt.show()
# Three sample values passed to the helper defined above. This rebinds `x`.
x = [10,12,45]
my_plot(x)
In [7]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
class my_obj(object):
    """Toy object that holds a vector of random samples.

    Shows the three pieces of a simple class: state set in ``__init__``, a
    method that acts on it (``plot``) and a custom ``__repr__``.
    """

    def __init__(self, x):
        """Draw ``x`` samples from the standard normal distribution.

        Note that ``x`` is the NUMBER of samples, not the data itself.
        """
        self.x = np.random.randn(x)

    def plot(self):
        """Draw the samples as a line plot and show the figure."""
        plt.plot(self.x)
        plt.show()

    def __repr__(self):
        """Return one "index value" line per sample, followed by 'Merhaba'.

        The listing is returned rather than printed. ``repr()`` is also called
        implicitly by debuggers, logging and containers, so it must not write
        to stdout as a side effect. The notebook still shows the same text.
        """
        lines = ['{} {}'.format(i, str(value)) for i, value in enumerate(self.x)]
        lines.append('Merhaba')
        return '\n'.join(lines)
# Build an object from the argument 10, plot it, then let the notebook display it
# (a bare last expression is rendered through the object's __repr__).
u = my_obj(10)
u.plot()
u
Out[7]:
In [17]:
class polynomial(object):
    """Polynomial in one variable, stored as a coefficient list.

    ``c`` holds the coefficients from the highest power down to the constant
    term; ``v`` is the symbol used when rendering the variable.
    """

    def __init__(self, c, v):
        self.coeff = c
        self.v = v

    def __repr__(self):
        """Return the rendered polynomial and, on a second line, the raw coefficients.

        For ``polynomial([2, 3, 1], 'z')`` this is ``'2z^2 3z^1 1\\n[2, 3, 1]'``.
        The text is returned rather than printed. ``repr()`` is also called
        implicitly by debuggers, logging and containers, so it must not write
        to stdout as a side effect. With no coefficients only ``'[]'`` is returned.
        """
        top_power = len(self.coeff) - 1
        terms = []
        for power, c in zip(range(top_power, -1, -1), self.coeff):
            if power > 0:
                # Every non-constant term carries the variable and its exponent.
                terms.append('{}{}^{} '.format(c, self.v, power))
            else:
                terms.append('{}'.format(c))
        if not terms:
            return str(self.coeff)
        return ''.join(terms) + '\n' + str(self.coeff)
# Coefficients 2, 3, 1 in the variable 'z'; the bare expression displays the object via __repr__.
p = polynomial([2,3,1], 'z')
p
Out[17]:
In [1]:
import pandas as pd
import pandas_datareader as web
import datetime
# Re-download MSFT and AAPL for November 2017 only.
# This rebinds msft/aapl, replacing the frames loaded earlier in the notebook.
start = datetime.datetime(2017, 11, 1)
end = datetime.datetime(2017, 11, 30)
msft = web.DataReader("MSFT", 'yahoo', start, end)
aapl = web.DataReader("AAPL", 'yahoo', start, end)
In [29]:
import matplotlib.pyplot as plt
import numpy as np
# Move of each row: Close minus Open (positive = closed above the open).
u = aapl.Close - aapl.Open
thr = 0.5

# Three buckets that together cover every finite value:
#   falling: u <= -thr   flat: |u| < thr   rising: u >= thr
# The original used `u < -thr` for the first bucket, so a row with u exactly equal
# to -thr matched none of the three conditions and was silently left out of the plot.
falling = u <= -thr
flat = np.abs(u) < thr
rising = u >= thr

plt.bar(u[falling].index, u[falling], color='red')
plt.bar(u[flat].index, u[flat], color='blue')
plt.bar(u[rising].index, u[rising], color='green')
plt.show()
In [24]:
# Minimal bar chart: bars at positions x with heights y. This rebinds `x` and `y`.
x = [1,5,4]
y = [10,30,70]
plt.bar(x, y)
plt.show()